{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "import sys\n",
    "# Add the parent directory (as a relative path) to the module search path.\n",
    "# NOTE(review): presumably this is what lets `from config import ...` in the next cell resolve -- confirm.\n",
    "sys.path.append('..')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-05T15:16:57.010681Z",
     "start_time": "2023-09-05T15:16:57.000956Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [],
   "source": [
    "from apify_client import ApifyClient\n",
    "from langchain.document_loaders.apify_dataset import ApifyDatasetLoader\n",
    "from config import APIFY_API_TOKEN\n",
    "\n",
    "# The token is passed explicitly: when this was tested, relying on the\n",
    "# environment variable alone did not work.\n",
    "apify = ApifyClient(token=APIFY_API_TOKEN)\n",
    "\n",
    "# Handle for the website-content-crawler actor, which is run in the next cell.\n",
    "actor = apify.actor(actor_id=\"apify/website-content-crawler\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-05T15:36:59.378878Z",
     "start_time": "2023-09-05T15:35:58.969781Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "outputs": [],
   "source": [
    "# Start the crawl and wait for the actor run to finish.\n",
    "crawl_input = {\n",
    "    \"startUrls\": [\n",
    "        {\"url\": \"https://docs.smith.langchain.com/overview\"},\n",
    "    ],\n",
    "}\n",
    "client_run = actor.call(run_input=crawl_input)\n",
    "# Another docs URL kept for reference (presumably an alternative start URL):\n",
    "# https://python.langchain.com/en/latest/"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:19:01.254018Z",
     "start_time": "2023-09-06T16:18:28.746831Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NpCG9x1sGpBoXK1zV\n"
     ]
    }
   ],
   "source": [
    "# Read the id of the run's default dataset; it is handed to the dataset loader in the next cell.\n",
    "# Dataset ids can also be looked up at https://console.apify.com/storage/datasets\n",
    "dataset_id = client_run.get(\"defaultDatasetId\") if client_run else None\n",
    "if not dataset_id:\n",
    "    # Fail here with a clear message instead of passing None on to the loader.\n",
    "    raise RuntimeError(\n",
    "        \"The Apify run did not report a defaultDatasetId; inspect the run in the Apify console.\"\n",
    "    )\n",
    "print(dataset_id)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:19:01.268040Z",
     "start_time": "2023-09-06T16:19:01.256390Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [],
   "source": [
    "from langchain.schema import Document\n",
    "\n",
    "\n",
    "def to_document(dataset_item):\n",
    "    \"\"\"Map one dataset item to a Document: its \"text\" is the content, its \"url\" the source.\"\"\"\n",
    "    return Document(\n",
    "        page_content=dataset_item[\"text\"],\n",
    "        metadata={\"source\": dataset_item[\"url\"]},\n",
    "    )\n",
    "\n",
    "\n",
    "# Loader that reads the crawl dataset and converts every item with to_document.\n",
    "loader = ApifyDatasetLoader(dataset_id=dataset_id, dataset_mapping_function=to_document)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:19:01.434160Z",
     "start_time": "2023-09-06T16:19:01.271858Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "outputs": [],
   "source": [
    "# `os` was never imported in this notebook, so a fresh kernel raised NameError here.\n",
    "import os\n",
    "\n",
    "# Directory under the current user's home that is passed to Chroma as its persist_directory.\n",
    "persist_directory = os.path.join(os.path.expanduser(\"~\"), \"Documents\", \"indexes\", \"langsmith\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:35:51.302862Z",
     "start_time": "2023-09-06T16:35:51.288765Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "outputs": [
    {
     "data": {
      "text/plain": "'/Users/ben/Documents/indexes/langsmith'"
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Echo the resolved path as a sanity check before it is handed to Chroma.\n",
    "persist_directory"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:35:55.785723Z",
     "start_time": "2023-09-06T16:35:55.777278Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "outputs": [],
   "source": [
    "from langchain.indexes import VectorstoreIndexCreator\n",
    "from langchain.vectorstores import Chroma\n",
    "\n",
    "# Build a Chroma-backed index from the loader; the store is pointed at\n",
    "# persist_directory so it can be reopened from disk in a later cell.\n",
    "index_creator = VectorstoreIndexCreator(\n",
    "    vectorstore_cls=Chroma,\n",
    "    vectorstore_kwargs={\"persist_directory\": persist_directory},\n",
    ")\n",
    "index = index_creator.from_loaders([loader])"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:36:16.563933Z",
     "start_time": "2023-09-06T16:36:14.958281Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " LangSmith is a tool used to debug LLMs, chains, and agents, track token usage, and collect examples of unexpected outcomes.\n",
      "\n",
      "https://docs.smith.langchain.com/overview\n"
     ]
    }
   ],
   "source": [
    "# Ask the index a question and print the answer followed by its sources.\n",
    "query = \"What is LangSmith?\"\n",
    "result = index.query_with_sources(query)\n",
    "for field in (\"answer\", \"sources\"):\n",
    "    print(result[field])"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:36:27.022639Z",
     "start_time": "2023-09-06T16:36:24.137849Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "outputs": [],
   "source": [
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "\n",
    "# Embedding model that is given to Chroma when the persisted index is reopened for retrieval.\n",
    "embeddings = OpenAIEmbeddings()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:41:28.509265Z",
     "start_time": "2023-09-06T16:41:28.499091Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "outputs": [],
   "source": [
    "# Reopen the Chroma store from persist_directory, with `embeddings` as its embedding function.\n",
    "db = Chroma(\n",
    "    embedding_function=embeddings,\n",
    "    persist_directory=persist_directory,\n",
    ")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:43:42.861667Z",
     "start_time": "2023-09-06T16:43:42.832451Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "outputs": [],
   "source": [
    "from langchain.chains import FlareChain\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "\n",
    "# GPT-4 chat model at temperature 0.\n",
    "LLM = ChatOpenAI(model_name=\"gpt-4\", temperature=0)\n",
    "\n",
    "# FLARE chain that retrieves its context from the reopened Chroma store.\n",
    "flare_options = {\n",
    "    \"max_generation_len\": 1500,\n",
    "    \"max_tokens_limit\": 1500,\n",
    "    \"verbose\": True,\n",
    "}\n",
    "agent = FlareChain.from_llm(llm=LLM, retriever=db.as_retriever(), **flare_options)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:50:33.934178Z",
     "start_time": "2023-09-06T16:50:33.924826Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001B[1m> Entering new FlareChain chain...\u001B[0m\n",
      "\u001B[36;1m\u001B[1;3mCurrent Response: \u001B[0m\n",
      "Prompt after formatting:\n",
      "\u001B[32;1m\u001B[1;3mRespond to the user message using any relevant context. If context is provided, you should ground your answer in that context. Once you're done responding return FINISHED.\n",
      "\n",
      ">>> CONTEXT: \n",
      ">>> USER INPUT: What is LangSmith and what can it do?\n",
      ">>> RESPONSE: \u001B[0m\n",
      "\u001B[33;1m\u001B[1;3mGenerated Questions: ['What are some of the tools and resources that LangSmith offers to aid in language learning?', '\"What is one of the features that LangSmith offers to aid in language learning?\"', '\"What feature does LangSmith provide to help you understand your learning progress and areas for improvement?\"']\u001B[0m\n",
      "\n",
      "\u001B[1m> Finished chain.\u001B[0m\n"
     ]
    },
    {
     "data": {
      "text/plain": "' LangSmith is a platform for building, debugging, and monitoring natural language processing (NLP) applications. It provides a suite of tools for debugging LLMs, chains, agents, tools, and retrievers, as well as for monitoring performance and collecting feedback. It also makes it easy to curate datasets and export them for use in other contexts. '"
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the FLARE chain on a sample question; the returned answer is displayed as the cell result.\n",
    "question = \"What is LangSmith and what can it do?\"\n",
    "agent.run(question)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-09-06T16:50:46.835654Z",
     "start_time": "2023-09-06T16:50:34.578678Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
